In [1]:
#Import required packages
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
In [2]:
def format_date(df_date):
"""
Splits Meeting Times and Dates into datetime objects where applicable using regex.
"""
df_date['Days'] = df_date['Meeting_Times'].str.extract('([^\s]+)', expand=True)
df_date['Start_Date'] = df_date['Meeting_Dates'].str.extract('([^\s]+)', expand=True)
df_date['Year'] = df_date['Term'].astype(str).str.slice(0,4)
df_date['Quarter'] = df_date['Term'].astype(str).str.slice(4,6)
df_date['Term_Date'] = pd.to_datetime(df_date['Year'] + df_date['Quarter'], format='%Y%m')
df_date['End_Date'] = df_date['Meeting_Dates'].str.extract('(?<=-)(.*)(?= )', expand=True)
df_date['Start_Time'] = df_date['Meeting_Times'].str.extract('(?<= )(.*)(?=-)', expand=True)
df_date['Start_Time'] = pd.to_datetime(df_date['Start_Time'], format='%H%M')
df_date['End_Time'] = df_date['Meeting_Times'].str.extract('((?<=-).*$)', expand=True)
df_date['End_Time'] = pd.to_datetime(df_date['End_Time'], format='%H%M')
df_date['Duration_Hr'] = ((df_date['End_Time'] - df_date['Start_Time']).dt.seconds)/3600
return df_date
def format_xlist(df_xl):
"""
revises % capacity calculations by using Max Enrollment instead of room capacity.
"""
df_xl['Cap_Diff'] = np.where(df_xl['Xlst'] != '',
df_xl['Max_Enrl'].astype(int) - df_xl['Actual_Enrl'].astype(int),
df_xl['Room_Capacity'].astype(int) - df_xl['Actual_Enrl'].astype(int))
df_xl = df_xl.loc[df_xl['Room_Capacity'].astype(int) < 999]
return df_xl
In [3]:
pd.set_option('display.max_rows', None)
df = pd.read_csv('data/PSU_master_classroom_91-17.csv', dtype={'Schedule': object, 'Schedule Desc': object})
df = df.fillna('')
df = format_date(df)
# Avoid classes that only occur on a single day
df = df.loc[df['Start_Date'] != df['End_Date']]
#terms = [199104, 199204, 199304, 199404, 199504, 199604, 199704, 199804, 199904, 200004, 200104, 200204, 200304, 200404, 200504, 200604, 200704, 200804, 200904, 201004, 201104, 201204, 201304, 201404, 201504, 201604]
#terms = [200604, 200704, 200804, 200904, 201004, 201104, 201204, 201304, 201404, 201504, 201604]
#df = df.loc[df['Term'].isin(terms)]
df = df.loc[df['Online Instruct Method'] != 'Fully Online']
#dept_lst = ['MTH', 'CH', 'BI', 'CE', 'CS', 'ECE', 'EMGT' ]
#df = df.loc[df['Dept'].isin(dept_lst)]
# Calculate number of days per week and treat Sunday condition
df['Days_Per_Week'] = df['Days'].str.len()
df['Room_Capacity'] = df['Room_Capacity'].apply(lambda x: x if (x != 'No Data Available') else 0)
df['Building'] = df['ROOM'].str.extract('([^\s]+)', expand=True)
df_cl = format_xlist(df)
df_cl['%_Empty'] = df_cl['Cap_Diff'].astype(float) / df_cl['Room_Capacity'].astype(float)
# Normalize the results
df_cl['%_Empty'] = df_cl['Actual_Enrl'].astype(np.float32)/df_cl['Room_Capacity'].astype(np.float32)
df_cl = df_cl.replace([np.inf, -np.inf], np.nan).dropna()
In [4]:
from sklearn.preprocessing import LabelEncoder
df_cl = df_cl.sample(n = 80000)
# Save as a 1D array. Otherwise will throw errors.
y = np.asarray(df_cl['%_Empty'], dtype="|S6")
cols = df_cl[['Dept', 'Days', 'Start_Time', 'ROOM', 'Quarter', 'Room_Capacity', 'Building', 'Class', 'Instructor', 'Schedule', 'Max_Enrl']]
cat_columns = ['Dept', 'Days', 'Class', 'Start_Time', 'ROOM', 'Building', 'Instructor', 'Schedule']
#cols = df_cl[['Start_Time', 'Class', 'Instructor' ]]
#cat_columns = ['Start_Time', 'Class', 'Instructor']
for column in cat_columns:
categorical_mapping = {label: idx for idx, label in enumerate(np.unique(cols['{0}'.format(column)]))}
cols['{0}'.format(column)] = cols['{0}'.format(column)].map(categorical_mapping)
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
if Version(sklearn_version) < '0.18':
from sklearn.cross_validation import train_test_split
else:
from sklearn.model_selection import train_test_split
X = cols.iloc[:, :].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
Utilize Random Forests Method to determine feature importances. On the left, trees are trained independently by recursive binary partitioning of a bootstrapped sample of the input data, X . On the right, test data is dropped down through each tree and the response estimate is the average over the all the individual predictions in the forest.
In [5]:
from sklearn.ensemble import RandomForestClassifier
feat_labels = cols.columns[:]
forest = RandomForestClassifier(n_estimators=20,
random_state=0,
n_jobs=-1) # -1 sets n_jobs=n_CPU cores
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
print("%2d) %-*s %f" % (f + 1, 30,
feat_labels[indices[f]],
importances[indices[f]]))
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]),
importances[indices],
color='lightblue',
align='center')
plt.xticks(range(X_train.shape[1]),
feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()
In [6]:
from sklearn.metrics import accuracy_score
forest = RandomForestClassifier(n_estimators=20,
random_state=0,
n_jobs=-1) # -1 sets n_jobs=n_CPU cores
forest.fit(X_train, y_train)
y_predict = forest.predict(X)
y_actual = y
print(accuracy_score(y_actual, y_predict))